This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.
Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.
They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
ALPHAS <- seq(0,1, by = 0.1)
subset <- sample(genes, replace = F, size = 20)
#subset <- genes
load("results/brf_perumtations_mse_all_genes.rdata")
load("results/100_permutations_bRF_importances.rdata")
draw_gene_mean_sd <- function(gene, title = NULL){
lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
group_by(alpha, MSEtype) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
ggplot(aes(
x = as.numeric(alpha),
y = value,
color = MSEtype,
fill = MSEtype
)) +ggtitle(paste(gene, title))+ylab("MSE/Var(gene)")+
geom_ribbon(aes(ymin = mean_mse - sd_mse ,
ymax = mean_mse + sd_mse ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha")
}
draw_gene_effective_integration <- function(gene){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))%>%
ggplot(aes(x=alpha, y = summed_importance, color = dataset, fill = dataset)) +
geom_point(alpha = 0.1) + geom_smooth(se=F)+
geom_ribbon(aes(ymin = mean_imp - sd_imp ,
ymax = mean_imp + sd_imp ),
alpha = .4) +theme_pubr(legend = "top") +
ylab("effective data integration")
}
draw_MSE_vs_gene_effective_integration <- function(gene, return_cluster = F){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
if(return_cluster & length(tfs_with_motif) == 0)
return("not saved")
if(length(tfs_with_motif)==0)
return((ggplot()+ ggtitle ("no motif in this gene")))
inte <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))
curves <- lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
group_by(alpha, MSEtype) %>%
rename(dataset = MSEtype)%>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T),
alpha = as.numeric(alpha),
dataset = str_replace(dataset, "true_data", "trueData")) %>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(dataset) %>%
dplyr::arrange(summed_importance) %>%
mutate(lower = loess.sd(x=summed_importance, y=value)$lower,
upper = loess.sd(x=summed_importance, y=value)$upper) %>%
mutate(lower = frollmean(lower, 1, fill = NA),
upper = frollmean(upper, 1, fill = NA))
true <- curves %>%
filter(dataset == "trueData")
shuff <- curves %>%
filter(dataset != "trueData")
true[paste0("shuffled_", c("mean_mse", "lower", "upper"))] <- shuff[c("mean_mse", "lower", "upper")]
dev <- true%>%
mutate(deviation = ifelse(shuffled_lower - upper < 0 , 0, shuffled_lower - upper))
cluster = "not saved"
if(sum(dev$deviation > 0, na.rm = T) > 10) cluster = "class of interest"
if(return_cluster) return(cluster)
curves %>%
ggplot(aes(y=value, x = summed_importance, color = dataset, fill = dataset))+
geom_ribbon(aes(ymin =lower ,
ymax = upper ),
alpha = .4)+ geom_point(alpha = 0.1, size = 0.5) +
theme_pubr(legend = "top") + geom_smooth(method = "loess", se = F)+
ylab("MSE/Var(gene)") + xlab("effective integration") + ggtitle(paste(gene, cluster))
}
draw_MSE_vs_gene_effective_integration_mean <- function(gene, return_cluster = F){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
if(return_cluster & length(tfs_with_motif) == 0)
return("not saved")
if(length(tfs_with_motif)==0)
return((ggplot()+ ggtitle ("no motif in this gene")))
inte <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))
curves <- lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
rename(dataset = MSEtype,)%>%
mutate(alpha = as.numeric(alpha),
dataset = str_replace(dataset, "true_data", "trueData"))%>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(mean_imp, dataset) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T))
curves %>%
ggplot(aes(y=value, x = mean_imp, color = dataset, fill = dataset))+
geom_ribbon(aes(ymin =mean_mse-sd_mse ,
ymax = mean_mse + sd_mse), alpha = .4)+
geom_point(alpha = 0.1, size = 0.5) +
theme_pubr(legend = "top") + geom_smooth(method = "loess", se = F)+
ylab("MSE/Var(gene)") + xlab("effective integration") + ggtitle(paste(gene))
}
draw_MSE_vs_gene_effective_integration_t_test <- function(gene, return_cluster = F){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
if(return_cluster & length(tfs_with_motif) == 0)
return("not saved")
if(length(tfs_with_motif)==0)
return((ggplot()+ ggtitle ("no motif in this gene")))
inte <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))
lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
group_by(alpha, MSEtype) %>%
rename(dataset = MSEtype)%>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T),
alpha = as.numeric(alpha),
dataset = str_replace(dataset, "true_data", "trueData")) %>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(dataset)%>%
mutate(mean_mse = mean(value, na.rm=T)) %>%
ggplot(aes(x=dataset, y=value, color = dataset)) + geom_jitter(size=0.5) + geom_boxplot() +
stat_compare_means(method = "t.test") +
geom_point(aes(x=dataset, y=mean_mse), size = 2, col = 'black') +
ylab("MSE/Var(gene)") + xlab("effective integration") +
ggtitle(paste(gene)) + theme_pubr()
}
gene <-"AT1G08090"
# clusters_rf_eff <- sapply(genes, draw_MSE_vs_gene_effective_integration, return_cluster=T)
# table(clusters_rf_eff))
# # # save(clusters_rf_eff, file = "results/clusters_mse_eff_bRF_100permutations.rdata")
# #
# load("results/clusters_mse_bRF_100permutations.rdata")
# pos_genes <- names(which(clusters_rf==2))
# load("results/clusters_mse_eff_bRF_100permutations.rdata")
# pos_genes_eff <- names(which(clusters_rf_eff=="class of interest"))
#
# length(intersect(pos_genes, pos_genes_eff))
#
#
# dev%>%
# ggplot(aes(x=summed_importance, y=deviation)) + geom_point()+
# draw_MSE_vs_gene_effective_integration(gene)
for(gene in sample(genes, replace = F, size = 10)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+
draw_MSE_vs_gene_effective_integration(gene)+
draw_MSE_vs_gene_effective_integration_mean(gene)+
plot_layout(ncol=4))
}
#
#genes saved with new method
# for(gene in setdiff(pos_genes_eff, pos_genes)){
# print(draw_MSE_vs_gene_effective_integration(gene))
# }
# genes lost with new method
# for(gene in sample(setdiff(pos_genes, pos_genes_eff), size = 10, replace = F)){
# print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
# }
# #
# gene <- "AT5G01480"
# pwm_occurrence["AT1G08090", "AT5G10030"]
load("results/clusters_mse_bRF_100permutations.rdata")
pos_genes <- names(which(clusters_rf==2))
# for(gene in sample(pos_genes,10, replace = F)){
# print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
# }
for(gene in sample(pos_genes, replace = F, size = 10)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+
draw_MSE_vs_gene_effective_integration(gene)+
draw_MSE_vs_gene_effective_integration_mean(gene)+
plot_layout(ncol=4))
}
load("results/lasso_perumtations_mse_all_genes.rdata")
load("results/100_permutations_lasso_importances_mda_shuffle.rdata")
for(gene in sample(genes, replace = F, size = 10)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+
draw_MSE_vs_gene_effective_integration(gene)+
draw_MSE_vs_gene_effective_integration_mean(gene)+
plot_layout(ncol=4))
}
load("results/clusters_mse_lasso_100permutations.rdata")
pos_genes <- names(which(clusters_lasso==1))
for(gene in sample(pos_genes, replace = F, size = 10)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+
draw_MSE_vs_gene_effective_integration(gene)+
draw_MSE_vs_gene_effective_integration_mean(gene)+
plot_layout(ncol=4))
}